# Demo: obtain and display the current date and time via datetime.now().
# the datetime module supplies the now() constructor
import datetime

# snapshot the current local timestamp
current_time = datetime.datetime.now()

# show it to the user
print("Time now is:", current_time)
The problem statement revolves around the detection of fraud in auto insurance claims, which is a critical issue faced by many general insurance companies. The fraudulent claims not only result in significant leakages for the insurer but also affect innocent people's lives. The insurance frauds can be classified based on their sources, including policyholders, intermediaries, and internal factors, and their nature, such as application, inflation, identity, fabrication, and staged accidents. Detecting and preventing frauds in auto insurance claims require a robust analytical and modelling framework that can predict the likelihood of fraud before processing the claims. The framework should also be capable of identifying the hidden patterns in the data that lead to fraudulent claims.
We are expected to perform exploratory data analysis, report the results from learning curves, build an analytical framework to predict fraud, and extract the top 20 patterns for fraudulent claims using decision tree algorithms only. This analytical and modelling framework can benefit not only the insurance companies but also the regulatory bodies and law enforcement agencies.
Objectives :
import os  # Provides functions for creating and removing a directory (folder), fetching its contents, changing and identifying the current directory, etc.
import pandas as pd  # To perform Operations on DataFrames. (duplicate `import pandas as pd` removed)
import numpy as np  # Perform a number of mathematical operations on arrays such as statistical, and algebraic.
import matplotlib.pyplot as plt  # plotting
import missingno as msno  # missing-value visualisations

# Widen pandas display limits so wide DataFrames print fully in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
We are provided with multiple CSV files, which are to be merged and used for Model Building.
Demographics Data:
Policy Information:
Claim Information:
Data of Vehicle:
Fraud Data :
We are provided with 5 different files for Training Data.
Files are:
Note:
# Load the five training CSVs. Each file encodes missing data with different
# sentinels, so a per-file na_values list converts those markers to NaN on read.
train_demographic = pd.read_csv("../Train Data/Train_Demographics.csv",na_values=['NA'])
train_policy = pd.read_csv("../Train Data/Train_Policy.csv",na_values=['NA', '-1', 'MISSINGVAL'])
train_claim = pd.read_csv("../Train Data/Train_Claim.csv",na_values=['?', '-5', 'MISSINGVALUE', 'MISSEDDATA'])
train_vehicle = pd.read_csv("../Train Data/Train_Vehicle.csv" ,na_values=['???'])
train_target = pd.read_csv("../Train Data/Traindata_with_Target.csv")
# print the shapes of the dataframes
print('Shape of train_demographic:', train_demographic.shape)
print('Shape of train_policy:', train_policy.shape)
print('Shape of train_claim:', train_claim.shape)
print('Shape of train_vehicle:', train_vehicle.shape)
print('Shape of train_target:', train_target.shape)
# print the columns of the dataframes (comment fixed: these are column indexes, not shapes)
print('columns of train_demographic:', train_demographic.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of train_policy:', train_policy.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of train_claim:', train_claim.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of train_vehicle:', train_vehicle.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of train_target:', train_target.columns)
In this dataframe, for every "CustomerID" there are four different values in the subsequent columns. We need to transform the unique values in the "VehicleAttribute" column into individual columns and assign their respective values from the "VehicleAttributeDetails" column.
train_vehicle.head()
# pivot the dataframe to get unique values of VehicleAttribute as columns and VehicleAttributeDetails as the values
# (each CustomerID has one row per attribute, so aggfunc='first' just picks that single detail value)
train_vehicle = train_vehicle.pivot_table(index='CustomerID', columns='VehicleAttribute', values='VehicleAttributeDetails', aggfunc='first')
train_vehicle.head()
# remove the 'VehicleAttribute' name the pivot left on the columns axis
train_vehicle = train_vehicle.rename_axis(None, axis=1)
# reset the index to convert the pivot table to a regular dataframe (CustomerID becomes a column again)
train_vehicle = train_vehicle.reset_index()
# print the resulting head of the dataframe and its shape
display(train_vehicle.head(10))
print("\n")
print('Shape of train_vehicle:', train_vehicle.shape)
# print the shapes of the dataframes
for frame_name, frame in [('train_demographic', train_demographic),
                          ('train_policy', train_policy),
                          ('train_claim', train_claim),
                          ('train_vehicle', train_vehicle),
                          ('train_target', train_target)]:
    print(f'Shape of {frame_name}:', frame.shape)
# merge the dataframes based on the CustomerID column, folding them in one by one
merged_df = train_demographic
for frame in (train_policy, train_claim, train_vehicle):
    merged_df = pd.merge(merged_df, frame, on='CustomerID')
# finally attach the target labels to obtain the full training frame
train_df = pd.merge(merged_df, train_target, on='CustomerID')
display(train_df.head(10))
print("\n")
print('Shape of train_df:', train_df.shape)
# Load the five test CSVs with the same per-file missing-value sentinels as the train set.
test_demographic = pd.read_csv("../Test Data/Test_Demographics.csv",na_values=['NA'])
test_policy = pd.read_csv("../Test Data/Test_Policy.csv",na_values=['NA', '-1', 'MISSINGVAL'])
test_claim = pd.read_csv("../Test Data/Test_Claim.csv",na_values=['?', '-5', 'MISSINGVALUE', 'MISSEDDATA'])
test_vehicle = pd.read_csv("../Test Data/Test_Vehicle.csv" ,na_values=['???'])
test_target = pd.read_csv("../Test Data/Test.csv")
# print the shapes of the dataframes
print('Shape of test_demographic:', test_demographic.shape)
print('Shape of test_policy:', test_policy.shape)
print('Shape of test_claim:', test_claim.shape)
print('Shape of test_vehicle:', test_vehicle.shape)
print('Shape of test_target:', test_target.shape)
# print the columns of the dataframes (comment fixed: these are column indexes, not shapes)
print('columns of test_demographic:', test_demographic.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of test_policy:', test_policy.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of test_claim:', test_claim.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of test_vehicle:', test_vehicle.columns)
print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
print('columns of test_target:', test_target.columns)
In this dataframe, for every "CustomerID" there are four different values in the subsequent columns. We need to transform the unique values in the "VehicleAttribute" column into individual columns and assign their respective values from the "VehicleAttributeDetails" column.
# pivot the dataframe to get unique values of VehicleAttribute as columns and VehicleAttributeDetails as the values
# (each CustomerID has one row per attribute, so aggfunc='first' just picks that single detail value)
test_vehicle = test_vehicle.pivot_table(index='CustomerID', columns='VehicleAttribute', values='VehicleAttributeDetails', aggfunc='first')
test_vehicle.head()
# remove the 'VehicleAttribute' name the pivot left on the columns axis
test_vehicle = test_vehicle.rename_axis(None, axis=1)
# reset the index to convert the pivot table to a regular dataframe
test_vehicle = test_vehicle.reset_index()
# print the resulting head of the dataframe and its shape
display(test_vehicle.head(10))
print("\n")
# bug fix: this label previously read 'train_vehicle' (copy-paste from the train cell)
print('Shape of test_vehicle:', test_vehicle.shape)
# print the shapes of the dataframes
for frame_name, frame in [('test_demographic', test_demographic),
                          ('test_policy', test_policy),
                          ('test_claim', test_claim),
                          ('test_vehicle', test_vehicle),
                          ('test_target', test_target)]:
    print(f'Shape of {frame_name}:', frame.shape)
# merge the dataframes based on the CustomerID column, folding them in one by one
merged_df = test_demographic
for frame in (test_policy, test_claim, test_vehicle):
    merged_df = pd.merge(merged_df, frame, on='CustomerID')
# finally attach the Test.csv frame to obtain the full test frame
test_df = pd.merge(merged_df, test_target, on='CustomerID')
display(test_df.head(10))
print("\n")
print('Shape of test_df:', test_df.shape)
Steps for Data Pre-Processing:
a. train_test_split
b. **Outcome:**
* X_train, X_test, y_train, y_test
a. **Outcome:**
* X_train_cat, X_train_num, X_test_cat, X_test_num
a. Impute missing values (Mode)
* Fit on X_train_cat
* Transform on X_train_cat & X_test_cat
b. Label Encoding
* Only for columns which have ordinality or the target column.
c. One-hot Encoding
* Only for columns with no ordinality or the target column.
* Better at handling errors, or new values generated when compared to get_dummies.
* Can do fit_transform directly.
a. Impute missing values (Mean, Median)
* Fit on X_train_num
* Transform on X_train_num & X_test_num
b. Standardization:
* Fit on X_train_num
* Transform on X_train_num & X_test_num
c. Normalization:
* Fit on X_train_num
* Transform on X_train_num & X_test_num
We do the following steps in this part.
Steps:
Note:
display(train_df.head())
# create two new columns by splitting the values column
# Policy_CombinedSingleLimit apparently holds "<a>/<b>" split-limit notation — TODO confirm against the raw data
train_df[['SplitLimit', 'CombinedSingleLimit']] = train_df['Policy_CombinedSingleLimit'].str.split('/', expand=True)
# convert the columns to the appropriate data types if necessary
# NOTE(review): astype(int) raises on NaN or non-numeric values — assumes the column is fully populated
train_df['SplitLimit'] = train_df['SplitLimit'].astype(int)
train_df['CombinedSingleLimit'] = train_df['CombinedSingleLimit'].astype(int)
# dropping the original column now that it has been split
train_df.drop("Policy_CombinedSingleLimit", axis=1, inplace=True)
display(train_df.head())
print('Shape of train_df:', train_df.shape)
display(test_df.head())
# create two new columns by splitting the values column
# Policy_CombinedSingleLimit apparently holds "<a>/<b>" split-limit notation — TODO confirm against the raw data
test_df[['SplitLimit', 'CombinedSingleLimit']] = test_df['Policy_CombinedSingleLimit'].str.split('/', expand=True)
# convert the columns to the appropriate data types if necessary
# NOTE(review): astype(int) raises on NaN or non-numeric values — assumes the column is fully populated
test_df['SplitLimit'] = test_df['SplitLimit'].astype(int)
test_df['CombinedSingleLimit'] = test_df['CombinedSingleLimit'].astype(int)
# dropping the original column now that it has been split
test_df.drop("Policy_CombinedSingleLimit", axis=1, inplace=True)
display(test_df.head())
print('Shape of test_df:', test_df.shape)
In this step, we change the columns to their appropriate data-types as mentioned below.
Columns and their data-type:
train_df.dtypes
# function to convert the given columns of a dataframe to the mentioned data-type
def convert_columns_types_to_category(DataFrame, cols=None, col_type=None):
    """Cast ``cols`` of ``DataFrame`` to ``col_type`` (e.g. 'category'), in place.

    Displays the dtypes before and after the cast so the conversion can be
    eyeballed in the notebook, then returns the (mutated) DataFrame.
    """
    display('### Before conversion: ###', DataFrame.dtypes)  # dtypes before converting
    DataFrame[cols] = DataFrame[cols].astype(col_type)  # bulk cast via astype()
    display('### After conversion: ###', DataFrame.dtypes)  # dtypes after converting
    return DataFrame
# categorical conversion
# getting categoric columns in cat_cols variable
cat_cols = ['CustomerID', 'InsuredZipCode', 'InsuredGender', 'InsuredEducationLevel',
'InsuredOccupation', 'InsuredHobbies', 'Country', 'InsurancePolicyNumber',
'InsurancePolicyState', 'InsuredRelationship', 'TypeOfIncident', 'TypeOfCollission',
'SeverityOfIncident', 'AuthoritiesContacted', 'IncidentState', 'IncidentCity',
'IncidentAddress', 'PropertyDamage', 'PoliceReport', 'VehicleID',
'VehicleMake', 'VehicleModel', 'ReportedFraud']
# calling the convert_columns_types_to_category() function defined above
train_df = convert_columns_types_to_category(train_df, cols=cat_cols, col_type = 'category')
# datetime conversion
# using the pandas to_datetime() function we convert the columns into date-time format
# NOTE(review): train dates parse with '%d-%m-%Y' while the test section below uses
# '%Y-%m-%d' — confirm the raw CSVs really use different date formats.
train_df['DateOfPolicyCoverage'] = pd.to_datetime(train_df['DateOfPolicyCoverage'], format='%d-%m-%Y')
train_df['DateOfIncident'] = pd.to_datetime(train_df['DateOfIncident'], format='%d-%m-%Y')
train_df['VehicleYOM'] = pd.to_datetime(train_df['VehicleYOM'], format='%Y')
# inspect the resulting dtypes (train, then the still-unconverted test frame)
train_df.dtypes
test_df.dtypes
# categorical conversion
# getting categoric columns in cat_cols variable
# (same list as for train, minus 'ReportedFraud' — the test set has no target column)
cat_cols = ['CustomerID', 'InsuredZipCode', 'InsuredGender', 'InsuredEducationLevel',
'InsuredOccupation', 'InsuredHobbies', 'Country', 'InsurancePolicyNumber',
'InsurancePolicyState', 'InsuredRelationship', 'TypeOfIncident', 'TypeOfCollission',
'SeverityOfIncident', 'AuthoritiesContacted', 'IncidentState', 'IncidentCity',
'IncidentAddress', 'PropertyDamage', 'PoliceReport', 'VehicleID',
'VehicleMake', 'VehicleModel']
# calling the convert_columns_types_to_category() function defined above
test_df = convert_columns_types_to_category(test_df, cols=cat_cols, col_type = 'category')
# datetime conversion
# using the pandas to_datetime() function we convert the columns into date-time format
# NOTE(review): test dates parse with '%Y-%m-%d' while the train section above used
# '%d-%m-%Y' — confirm the raw CSVs really use different date formats.
test_df['DateOfPolicyCoverage'] = pd.to_datetime(test_df['DateOfPolicyCoverage'], format='%Y-%m-%d')
test_df['DateOfIncident'] = pd.to_datetime(test_df['DateOfIncident'], format='%Y-%m-%d')
test_df['VehicleYOM'] = pd.to_datetime(test_df['VehicleYOM'], format='%Y')
test_df.dtypes
By looking at the data we can determine columns which can be dropped before even building models.
Such Columns are:
# Function to drop unnecessary columns from a DataFrame
def drop_unnecessary_columns(DataFrame, cols=None):
    """Drop ``cols`` from ``DataFrame`` in place and return it.

    Displays the column index before and after the drop so the change can be
    eyeballed in the notebook.
    """
    display('Before Dropping : ', DataFrame.columns)  # columns before dropping
    print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
    # 'columns=' already implies axis=1, so the redundant axis argument was removed
    DataFrame.drop(columns = cols, inplace = True)
    print("♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦-♦")
    display('After Dropping : ',DataFrame.columns)  # columns after dropping
    return DataFrame
# storing the columns which are to be dropped in drop_cols
# (these are identifier-like columns — presumably no predictive value; see the note above)
drop_cols = ['CustomerID', 'Country', 'InsurancePolicyNumber', 'VehicleID']
# calling the function defined above to drop the columns from the train frame
train_df = drop_unnecessary_columns(train_df, cols=drop_cols)
# storing the columns which are to be dropped in drop_cols
drop_cols = ['CustomerID', 'Country', 'InsurancePolicyNumber', 'VehicleID']
# calling the function defined above to drop the columns from the test frame
test_df = drop_unnecessary_columns(test_df, cols=drop_cols)
# this function checks for duplicate records in the dataframe
def check_duplicates(df):
    """Report fully duplicated rows in ``df``.

    Prints either a "no duplicates" message or the duplicated rows themselves,
    and returns the DataFrame of duplicated rows (empty when there are none)
    so callers can also inspect them programmatically.
    """
    # rows that are exact repeats of an earlier row
    duplicate_rows = df[df.duplicated()]
    if duplicate_rows.empty:
        print("There are no duplicate records.")
    else:
        print("Duplicate records:")
        print(duplicate_rows)
    return duplicate_rows
# verify there are no fully duplicated rows before feature engineering
check_duplicates(train_df)
check_duplicates(test_df)
# Derive Age of Vehicle on the day of Incident (in days)
train_df['VehicleAge'] = (train_df['DateOfIncident'] - train_df['VehicleYOM']).dt.days
# Derive Day of the Week of Incident
train_df['DayOfWeek'] = train_df['DateOfIncident'].dt.day_name()
# Derive Month of Incident
train_df['MonthOfIncident'] = train_df['DateOfIncident'].dt.month_name()
# Derive Time between Policy Coverage and Incident (in days)
train_df['TimeBetweenCoverageAndIncident'] = (train_df['DateOfIncident'] - train_df['DateOfPolicyCoverage']).dt.days
train_df.shape
# the raw date columns are no longer needed once the derived features exist
train_df.drop(['DateOfIncident', 'VehicleYOM', 'DateOfPolicyCoverage'], axis=1, inplace=True)
train_df.head()
# Change the dtype of 'DayOfWeek', 'MonthOfIncident' column from object to category
train_df['DayOfWeek'] = train_df['DayOfWeek'].astype('category')
train_df['MonthOfIncident'] = train_df['MonthOfIncident'].astype('category')
train_df.head()
train_df.dtypes
# Derive Age of Vehicle on the day of Incident (in days)
test_df['VehicleAge'] = (test_df['DateOfIncident'] - test_df['VehicleYOM']).dt.days
# Derive Day of the Week of Incident
test_df['DayOfWeek'] = test_df['DateOfIncident'].dt.day_name()
# Derive Month of Incident
test_df['MonthOfIncident'] = test_df['DateOfIncident'].dt.month_name()
# Derive Time between Policy Coverage and Incident (in days)
test_df['TimeBetweenCoverageAndIncident'] = (test_df['DateOfIncident'] - test_df['DateOfPolicyCoverage']).dt.days
# the raw date columns are no longer needed once the derived features exist
test_df.drop(['DateOfIncident', 'VehicleYOM', 'DateOfPolicyCoverage'], axis=1, inplace=True)
# Change the dtype of 'DayOfWeek', 'MonthOfIncident' column from object to category
test_df['DayOfWeek'] = test_df['DayOfWeek'].astype('category')
test_df['MonthOfIncident'] = test_df['MonthOfIncident'].astype('category')
test_df.head()
import seaborn as sns
# Visualizing the correlation matrix by plotting a heat map.
plt.style.use('seaborn-pastel')  # NOTE(review): renamed to 'seaborn-v0_8-pastel' in matplotlib >= 3.6
# numeric_only=True: pandas >= 2.0 raises on non-numeric (category) columns otherwise.
# Compute the matrix once and reuse it for both the mask and the heatmap.
corr = train_df.corr(numeric_only=True)
# mask the upper triangle — the matrix is symmetric, so show each pair once
upper_triangle = np.triu(corr)
plt.figure(figsize=(25,25))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, square=True, fmt='0.2f',
            annot_kws={'size':16}, cmap="plasma", mask=upper_triangle)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
This heatmap visualizes the correlation matrix, letting us observe the relationship between each pair of numeric features.
The heatmap contains both positive and negative correlations.
We can observe that many of the columns are highly correlated with each other, which leads to the multicollinearity problem.
# Visualizing the correlation matrix of the test set by plotting a heat map.
plt.style.use('seaborn-pastel')  # NOTE(review): renamed to 'seaborn-v0_8-pastel' in matplotlib >= 3.6
# numeric_only=True: pandas >= 2.0 raises on non-numeric (category) columns otherwise.
# Compute the matrix once and reuse it for both the mask and the heatmap.
corr = test_df.corr(numeric_only=True)
# mask the upper triangle — the matrix is symmetric, so show each pair once
upper_triangle = np.triu(corr)
plt.figure(figsize=(25,25))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, square=True, fmt='0.2f',
            annot_kws={'size':16}, cmap="plasma", mask=upper_triangle)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
# Missing-value matrix for train and test.
# bug fix: plt.figure(figsize=...) was previously called AFTER msno.matrix, which
# opened a new, empty figure — the figsize never applied to the actual plot.
# Pass figsize to msno.matrix directly instead.
msno.matrix(train_df, figsize=(15, 9))
plt.show()
msno.matrix(test_df, figsize=(15, 9))
plt.show()
Findings:
# numeric feature columns whose train/test distributions are profiled below
columns= ['InsuredAge',
'CapitalGains',
'CapitalLoss',
'CustomerLoyaltyPeriod',
'Policy_Deductible',
'PolicyAnnualPremium',
'UmbrellaLimit',
'IncidentTime',
'NumberOfVehicles',
'BodilyInjuries',
'Witnesses',
'AmountOfTotalClaim',
'AmountOfInjuryClaim',
'AmountOfPropertyClaim',
'AmountOfVehicleDamage',
'SplitLimit',
'CombinedSingleLimit',
'VehicleAge']
def plot_boxplot_side_by_side(df1, df2, columns):
    """Draw paired boxplots — df1 (train) on the left, df2 (test) on the right —
    for every column name in ``columns``, one row of axes per column."""
    n = len(columns)
    fig, axs = plt.subplots(nrows=n, ncols=2, figsize=(15, 5 * n))
    # each row of the axes grid holds the (train, test) pair for one column
    for (left_ax, right_ax), col in zip(axs, columns):
        # train distribution on the left
        sns.boxplot(x=df1[col], ax=left_ax, color="teal")
        left_ax.set_xlabel(col + " (Train)")
        # test distribution on the right
        sns.boxplot(x=df2[col], ax=right_ax, color="indigo")
        right_ax.set_xlabel(col + " (Test)")
    # Show the plot
    plt.show()
plot_boxplot_side_by_side(train_df, test_df, columns)
Findings:
# columns= ['InsuredAge',
# 'CapitalGains',
# 'CapitalLoss',
# 'CustomerLoyaltyPeriod',
# 'Policy_Deductible',
# 'PolicyAnnualPremium',
# 'UmbrellaLimit',
# 'IncidentTime',
# 'NumberOfVehicles',
# 'BodilyInjuries',
# 'Witnesses',
# 'AmountOfTotalClaim',
# 'AmountOfInjuryClaim',
# 'AmountOfPropertyClaim',
# 'AmountOfVehicleDamage',
# 'SplitLimit',
# 'CombinedSingleLimit',
# 'VehicleAge']
# def plot_kdeplot_side_by_side(df1, df2, columns):
# # Create two subplots
# fig, axs = plt.subplots(nrows=len(columns), ncols=2, figsize=(15, 5*len(columns)))
# for i, col in enumerate(columns):
# # Draw the first plot on the left
# sns.kdeplot(df1[col], ax=axs[i, 0], fill=True, color='teal')
# axs[i, 0].set_xlabel(col + " (Train)")
# # Draw the second plot on the right
# sns.kdeplot(df2[col], ax=axs[i, 1], fill=True, color='indigo')
# axs[i, 1].set_xlabel(col + " (Test)")
# # Show the plot
# plt.show()
# plot_kdeplot_side_by_side(train_df, test_df, columns)
# Checking the skewness of the numeric columns in our DataFrame
# (numeric_only=True: pandas >= 2.0 raises when non-numeric columns are present)
train_df.skew(numeric_only=True).sort_values()
Findings:
def generate_pie(df):
    """Pie chart of the value counts of a two-category Series, with % labels.

    The explode/colors tuples assume exactly two categories (e.g. N/Y target).
    """
    counts = df.value_counts()
    plt.figure(figsize=(10,5))
    plt.pie(counts, labels=counts.index, autopct='%1.2f%%', shadow=True,
            explode=(0,.2), colors=['#006400' ,'#B22222'])
    plt.legend(prop={'size':9})
    plt.axis('equal')
    return plt.show()
generate_pie(train_df.ReportedFraud)
From the plot we can observe that the count of "N" is much higher than that of "Y". Most insurance claims have not been reported as fraudulent, which is usually the case.
Since it is our target column, it indicates the class imbalance issue. We will balance the data using oversampling method in later part.
# getting categoric columns in cat_cols variable (low-cardinality columns to pie-chart)
cat_cols = ['InsuredGender', 'InsuredEducationLevel',
'InsurancePolicyState', 'InsuredRelationship', 'TypeOfIncident',
'TypeOfCollission', 'SeverityOfIncident', 'AuthoritiesContacted',
'PropertyDamage', 'PoliceReport',
'MonthOfIncident']
# NOTE: redefines (shadows) the earlier generate_pie — this version has no
# explode/colors tuples, so it works for columns with any number of categories
def generate_pie(df):
    """Draw a pie chart of the value counts of the given Series."""
    plt.figure(figsize=(8,3))
    plt.pie(df.value_counts(), labels=df.value_counts().index, autopct='%1.2f%%',shadow=True)
    plt.legend(prop={'size':9})
    plt.axis('equal')
    return plt.show()
# iterating a sub-frame yields its column names
for col in train_df[cat_cols]:
    print(f"Pie plot for the column:", col)
    generate_pie(train_df[col])
Findings:
# getting categoric columns in cat_cols variable (higher-cardinality columns, count-plotted below)
cat_cols = ['InsuredOccupation', 'InsuredHobbies', 'IncidentState', 'IncidentCity', 'VehicleMake', 'DayOfWeek', 'MonthOfIncident']
def plot_countplots(df, columns):
    """Draw a countplot for each column of ``df`` named in ``columns``,
    laid out in a two-column grid of subplots."""
    # Calculate the number of rows and columns needed for the plot
    n_cols = 2
    n_rows = (len(columns) + 1) // 2
    # Create the subplot grid
    fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 4*n_rows))
    plt.xticks(rotation=90)
    # Flatten the subplot grid so it can be indexed with a single counter
    axs = axs.flatten()
    # Loop over the columns and plot a countplot for each one
    for i, col in enumerate(columns):
        sns.countplot(x=col, data=df, ax=axs[i],palette="gnuplot", )
        axs[i].set_xticklabels(axs[i].get_xticklabels(), rotation=90)
    # Remove any unused subplots (when len(columns) is odd, the last cell is empty)
    if len(columns) < n_rows * n_cols:
        for i in range(len(columns), n_rows * n_cols):
            fig.delaxes(axs[i])
    # Display the plot
    plt.tight_layout()
    plt.show()
plot_countplots(train_df, cat_cols)
Finding:
# Comparison between pairs of numeric variables, coloured by the fraud label
# (title typo 'Comparision' fixed throughout)
plt.figure(figsize=[18,13])
plt.subplot(2,2,1)
plt.title('Comparison CustomerLoyaltyPeriod and InsuredAge')
sns.scatterplot(x=train_df['CustomerLoyaltyPeriod'],y=train_df['InsuredAge'],hue=train_df['ReportedFraud'],palette="gnuplot");
plt.subplot(2,2,2)
plt.title('Comparison between AmountOfTotalClaim and AmountOfInjuryClaim')
sns.scatterplot(x=train_df['AmountOfTotalClaim'],y=train_df['AmountOfInjuryClaim'],hue=train_df['ReportedFraud'],palette="gnuplot");
plt.subplot(2,2,3)
plt.title('Comparison between AmountOfPropertyClaim and AmountOfVehicleDamage')
sns.scatterplot(x=train_df['AmountOfPropertyClaim'],y=train_df['AmountOfVehicleDamage'],hue=train_df['ReportedFraud'],palette="gnuplot");
plt.subplot(2,2,4)
plt.title('Comparison between CustomerLoyaltyPeriod and AmountOfTotalClaim')
sns.scatterplot(x=train_df['CustomerLoyaltyPeriod'],y=train_df['AmountOfTotalClaim'],hue=train_df['ReportedFraud'],palette="gnuplot");
From the above scatter plot we can observe the following things:
fig,axes=plt.subplots(2,2,figsize=(12,10))
# Comparing InsuredGender and InsuredAge, split by fraud label
sns.violinplot(x='InsuredGender',y='InsuredAge',ax=axes[0,0],data=train_df,palette="ch:.25",hue="ReportedFraud",split=True)
# Comparing InsurancePolicyState and Witnesses, split by fraud label
sns.violinplot(x='InsurancePolicyState',y='Witnesses',ax=axes[0,1],data=train_df,hue="ReportedFraud",split=True,palette="hls")
# Comparing CombinedSingleLimit and AmountOfPropertyClaim, split by fraud label
sns.violinplot(x='CombinedSingleLimit',y='AmountOfPropertyClaim',ax=axes[1,0],data=train_df,hue="ReportedFraud",split=True,palette="Dark2")
# Comparing SplitLimit and InsuredAge, split by fraud label
sns.violinplot(x='SplitLimit',y='InsuredAge',ax=axes[1,1],data=train_df,hue="ReportedFraud",split=True,palette="mako")
plt.show()
Findings:
fig,axes=plt.subplots(2,2,figsize=(12,12))
# Distribution of AmountOfTotalClaim per fraud label
sns.violinplot(x='ReportedFraud',y='AmountOfTotalClaim',ax=axes[0,0],data=train_df,hue="ReportedFraud" ,palette="hls")
# Distribution of AmountOfVehicleDamage per fraud label
sns.violinplot(x='ReportedFraud',y='AmountOfVehicleDamage',ax=axes[0,1],data=train_df,hue="ReportedFraud",palette="cool_r")
# Distribution of AmountOfPropertyClaim per fraud label
sns.violinplot(x='ReportedFraud',y='AmountOfPropertyClaim',ax=axes[1,0],data=train_df,hue="ReportedFraud",palette="cividis")
# Distribution of AmountOfInjuryClaim per fraud label
sns.violinplot(x='ReportedFraud',y='AmountOfInjuryClaim',ax=axes[1,1],data=train_df,hue="ReportedFraud",palette="gnuplot2")
plt.show()
Findings:
# Counts of InsurancePolicyState, split by ReportedFraud
sns.catplot(x='InsurancePolicyState',kind='count',data=train_df,hue='ReportedFraud',palette="Dark2")
plt.show()
Findings:
# Counts of InsuredEducationLevel, split by ReportedFraud
sns.catplot(x='InsuredEducationLevel',kind='count',data=train_df,hue='ReportedFraud',palette="tab20b_r")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of InsuredOccupation, split by ReportedFraud
sns.catplot(x='InsuredOccupation',kind='count',data=train_df,hue='ReportedFraud',palette="spring_r")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of InsuredHobbies, split by ReportedFraud
sns.catplot(x='InsuredHobbies',kind='count',data=train_df,hue='ReportedFraud',palette="gnuplot2")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of InsuredRelationship, split by ReportedFraud
sns.catplot(x='InsuredRelationship',kind='count',data=train_df,hue='ReportedFraud',palette="gist_earth")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of TypeOfIncident, split by ReportedFraud
sns.catplot(x='TypeOfIncident',kind='count',data=train_df,hue='ReportedFraud',palette="Set2_r")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of TypeOfCollission, split by ReportedFraud
sns.catplot(x='TypeOfCollission',kind='count',data=train_df,hue='ReportedFraud',palette="gist_earth")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of SeverityOfIncident, split by ReportedFraud
sns.catplot(x='SeverityOfIncident',kind='count',data=train_df,hue='ReportedFraud',palette="mako")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of AuthoritiesContacted, split by ReportedFraud
sns.catplot(x='AuthoritiesContacted',kind='count',data=train_df,hue='ReportedFraud',palette="magma")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of IncidentState, faceted (col=) into one panel per ReportedFraud value
sns.catplot(x='IncidentState',kind='count',data=train_df,col='ReportedFraud',palette="cubehelix")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of IncidentCity, split by ReportedFraud
sns.catplot(x='IncidentCity',kind='count',data=train_df,hue='ReportedFraud',palette="bright")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of PropertyDamage, faceted (col=) into one panel per ReportedFraud value
sns.catplot(x='PropertyDamage',kind='count',data=train_df,col='ReportedFraud',palette="ocean")
plt.show()
Findings:
# Counts of PoliceReport, faceted (col=) into one panel per ReportedFraud value
sns.catplot(x='PoliceReport',kind='count',data=train_df,col='ReportedFraud',palette="coolwarm")
plt.xticks(rotation=90)
plt.show()
Findings:
# Counts of VehicleMake, split by ReportedFraud
sns.catplot(x='VehicleMake',kind='count',data=train_df,hue='ReportedFraud',palette="bright")
plt.xticks(rotation=90)
plt.show()
Findings:
# Pairwise scatter matrix of the numeric columns, coloured by ReportedFraud.
# NOTE(review): pairplot over the whole frame can be very slow with many columns —
# consider restricting to a feature subset if this cell takes too long.
sns.pairplot(train_df,hue="ReportedFraud",palette="ocean")
plt.show()